library(Seurat)
library(tidyverse)
library(magrittr)
library(ggplot2)
library(patchwork)
library(sctransform)
library(cowplot)
library(harmony)

Step 1: Load 10X filtered matrices into seurat

1.1 set up the Seurat objects

#Prior to load the data, please make sure the gene feature names shoud use uppercases (especiall for mouse).
samples <- list.files("/Users/zhuliu/Desktop/scRNA_STseq/proj_10X_woundhealing/03_results/00-cellrange_counts/")
sm_folders <- paste0("/Users/zhuliu/Desktop/scRNA_STseq/proj_10X_woundhealing/03_results/00-cellrange_counts/", samples, "/outs/filtered_feature_bc_matrix")

sceList <- list()
for (i in seq_along(samples)) {
  tmp_sms <- samples[i]
  sceList[[tmp_sms]] <- CreateSeuratObject(
                                          counts = Read10X(sm_folders[i]),
                                          min.cells = 3, # filter this later
                                          min.features = 200, # filter this later
                                          project = tmp_sms)
  
}

# filtering by nCount and nFeatures per individual
filterCell <- function(combined){
  # calculate the quantile range
  count.feature.ls <- combined@meta.data[, c("nCount_RNA", "nFeature_RNA")]
  count.feature.ls %<>% map(log10) %>% map(~c(10^(mean(.x) + 3*sd(.x)), 10^(mean(.x) - 3*sd(.x))))

  # filter cells
  combined <- subset(combined, subset = nFeature_RNA > 200 & 
                     nFeature_RNA < count.feature.ls[[2]][1] & 
                     nCount_RNA < count.feature.ls[[1]][1]) 
  return(combined)
}
sceList %<>% map(filterCell)

hswound.com <- merge(sceList[[1]], 
                      y = c(sceList[[2]], sceList[[3]], sceList[[4]], sceList[[5]], sceList[[6]],
                                  sceList[[7]], sceList[[8]], sceList[[9]], sceList[[10]], sceList[[11]], sceList[[12]]), 
                      add.cell.ids = samples, 
                      project = "humanWounds")
hswound.com #
head(colnames(hswound.com)); tail(colnames(hswound.com))
#cell numbers of each sample
table(Idents(hswound.com))
rm(sceList) #remove individual seurat objects

1.2 add sample level metadata (Age, Sex, etc)

metadata <- readxl::read_xlsx("/Users/zhuliu/Desktop/scRNA_STseq/proj_10X_woundhealing/02_metadata/Metadata_STsc.xlsx") %>% slice(17:28)
metadata <- metadata %>% select(3:6,11) %>% mutate(User_ID = gsub("_","",User_ID)) %>% rename(ID=User_ID)
metadata <- hswound.com@meta.data %>% left_join(., metadata, by=c("orig.ident" = "ID"))
rownames(metadata) <- Cells(hswound.com)
hswound.com <- AddMetaData(hswound.com, metadata = metadata)
saveRDS(hswound.com, file = "../../00_Origin_objects/s0_original_seurat_object.rds")

Original cell numbers for each sample

## An object of class Seurat 
## 27973 features across 65462 samples within 1 assay 
## Active assay: RNA (27973 features, 0 variable features)
## 
##  PWH26D0  PWH26D1 PWH26D30  PWH26D7  PWH27D0  PWH27D1 PWH27D30  PWH27D7 
##     4702     5035     3692     4815     3597     7280     6605     7338 
##  PWH28D0  PWH28D1 PWH28D30  PWH28D7 
##     5038     4973     6414     5973
##           used  (Mb) gc trigger   (Mb) limit (Mb)  max used   (Mb)
## Ncells 2752904 147.1    4899647  261.7         NA   4899647  261.7
## Vcells 4716516  36.0  712318396 5434.6      32768 741703711 5658.8

1.3 load file and filter out doublets for following quality control

hswound.com <- readRDS("../00_Origin_objects/s0_original_seurat_object.rds")
hswound.com
table(hswound.com$orig.ident)

#filter out the doublets
doublets <- list.files("/Users/zhuliu/Desktop/scRNA_STseq/proj_10X_woundhealing/03_results/01-Seurat-PreAnalysis/00_Seurat_qc_Doublets", pattern = ".txt$")
doublets.path <- paste0("../00_Seurat_qc_Doublets/", doublets)
doublets_files <- lapply(doublets.path, FUN = function(x){
  data.table::fread(x)
})
names(doublets_files) <- gsub("(Shared_doublets_)|(\\.txt)", "", doublets) 
doublets_com <- do.call("rbind", doublets_files) %>% 
  mutate(Doublet = "Doublet")
rm(doublets, doublets.path, doublets_files)
table(doublets_com$sampleID)

metadata <- hswound.com@meta.data %>% rownames_to_column(var = "mapID") %>% 
  left_join(., doublets_com[,c(3:4)], by=c("mapID" = "mapID")) %>% 
  column_to_rownames(var = "mapID") %>% 
  mutate(Doublet = ifelse(is.na(Doublet), "Singlet", Doublet))

hswound.com <- AddMetaData(hswound.com, metadata = metadata)
#Extract the singlet cells
hswound.com <- subset(hswound.com, subset = Doublet == "Singlet")
table(hswound.com$Doublet)
rm(metadata, doublets_com)
saveRDS(hswound.com, file = "../00_Origin_objects/s0_original_seurat_object_noDoublets.rds") 

Cell numbers for each sample after removing doublets

## An object of class Seurat 
## 27973 features across 64056 samples within 1 assay 
## Active assay: RNA (27973 features, 0 variable features)
## 
##  PWH26D0  PWH26D1 PWH26D30  PWH26D7  PWH27D0  PWH27D1 PWH27D30  PWH27D7 
##     4570     4928     3648     4784     3534     7084     6347     7034 
##  PWH28D0  PWH28D1 PWH28D30  PWH28D7 
##     4911     4935     6368     5913

Step 2: Quality Control

2.1 calculate QC

#ratio of mitochondrial genes
hswound.com[["percent.mt"]] <- PercentageFeatureSet(object = hswound.com, pattern = "^MT-")
rownames(hswound.com)[grep("^MT-", rownames(hswound.com))]
##  [1] "MT-ND1"  "MT-ND2"  "MT-CO1"  "MT-CO2"  "MT-ATP8" "MT-ATP6" "MT-CO3" 
##  [8] "MT-ND3"  "MT-ND4L" "MT-ND4"  "MT-ND5"  "MT-ND6"  "MT-CYB"
#ratio of ribosomal genes
hswound.com[["percent.ribo"]] <- PercentageFeatureSet(object = hswound.com, pattern = "^RP[SL]")
rownames(hswound.com)[grep("^RP[SL]", rownames(hswound.com))]
##   [1] "RPL22"       "RPL11"       "RPS6KA1"     "RPS8"        "RPL5"       
##   [6] "RPS27"       "RPS27AP5"    "RPS6KC1"     "RPS7"        "RPS27A"     
##  [11] "RPL31"       "RPL37A-DT"   "RPL37A"      "RPL32"       "RPL15"      
##  [16] "RPSA"        "RPL14"       "RPL29"       "RPL24"       "RPL22L1"    
##  [21] "RPL39L"      "RPL35A"      "RPL9"        "RPL34-DT"    "RPL34"      
##  [26] "RPS3A"       "RPL37"       "RPS23"       "RPS14"       "RPL26L1"    
##  [31] "RPS18"       "RPS10-NUDT3" "RPS10"       "RPL10A"      "RPL7L1"     
##  [36] "RPS12"       "RPS6KA2"     "RPS20"       "RPL7"        "RPL30"      
##  [41] "RPL8"        "RPS6"        "RPL35"       "RPL12"       "RPL7A"      
##  [46] "RPS24"       "RPLP2"       "RPL27A"      "RPS13"       "RPS6KA4"    
##  [51] "RPS6KB2"     "RPS6KB2-AS1" "RPS3"        "RPS25"       "RPS26"      
##  [56] "RPL41"       "RPL6"        "RPLP0"       "RPL21"       "RPS29"      
##  [61] "RPL36AL"     "RPS6KL1"     "RPS6KA5"     "RPS27L"      "RPL4"       
##  [66] "RPLP1"       "RPS17"       "RPL3L"       "RPS2"        "RPS15A"     
##  [71] "RPL13"       "RPL26"       "RPL23A"      "RPL23"       "RPL19"      
##  [76] "RPL27"       "RPS6KB1"     "RPL38"       "RPL17"       "RPS15"      
##  [81] "RPL36"       "RPS28"       "RPL18A"      "RPS16"       "RPS19"      
##  [86] "RPL18"       "RPL13A"      "RPS11"       "RPS9"        "RPL28"      
##  [91] "RPS5"        "RPS21"       "RPL3"        "RPS19BP1"    "RPS6KA3"    
##  [96] "RPS4X"       "RPS6KA6"     "RPL36A"      "RPL39"       "RPL10"      
## [101] "RPS4Y1"      "RPS6KA2-IT1"
#ratio of hemoglobin genes
hswound.com[["percent.hb"]] <- PercentageFeatureSet(object = hswound.com, pattern = "^HB[^(PES)]")
rownames(hswound.com)[grep("^HB[^(PES)]", rownames(hswound.com))]
## [1] "HBG2" "HBZ"  "HBA2" "HBA1" "HBD"  "HBQ1" "HBB"  "HBM"
rownames(hswound.com)[grep("^HB", rownames(hswound.com))]
##  [1] "HBEGF" "HBS1L" "HBP1"  "HBG2"  "HBZ"   "HBA2"  "HBA1"  "HBD"   "HBQ1" 
## [10] "HBB"   "HBM"
#ratio of MALAT1 genes
hswound.com[["percent.malat1"]] <- PercentageFeatureSet(object = hswound.com, pattern = "MALAT1")

sd(hswound.com@meta.data$nFeature_RNA)
## [1] 1765.312
d <- density(hswound.com@meta.data$nFeature_RNA)
{plot(d)
polygon(d, col="red", border="blue")}

table(hswound.com@meta.data$nFeature_RNA > 500)
## 
## FALSE  TRUE 
##  1475 62581
summary(hswound.com@meta.data$nCount_RNA)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     499   10091   19094   23702   31238  294135
d <- density(hswound.com@meta.data$nCount_RNA)
{plot(d)
polygon(d, col="red", border="blue")}

table(hswound.com@meta.data$nCount_RNA > 1000)
## 
## FALSE  TRUE 
##  1162 62894
table(hswound.com@meta.data$nCount_RNA > 1000 & hswound.com@meta.data$nCount_RNA < 150000)
## 
## FALSE  TRUE 
##  1224 62832
summary(hswound.com@meta.data$percent.mt)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   3.411   4.936   6.350   6.958  96.179
d <- density(hswound.com@meta.data$percent.mt)
{plot(d)
polygon(d, col="red", border="blue")}

table(hswound.com@meta.data$percent.mt < 20)
## 
## FALSE  TRUE 
##  2144 61912
table(hswound.com@meta.data$percent.mt < 15)
## 
## FALSE  TRUE 
##  3213 60843
table(hswound.com@meta.data$percent.mt < 10)
## 
## FALSE  TRUE 
##  6605 57451
summary(hswound.com@meta.data$percent.ribo)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.3565 19.2347 27.3616 27.0153 35.3310 79.5486
summary(hswound.com@meta.data$percent.hb)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0000  0.0028  0.0000 79.3182

2.2 QC plot before filtering

VlnPlot(
  hswound.com,
  features = c("nFeature_RNA", "nCount_RNA", "percent.mt", "percent.ribo", "percent.hb"),
  ncol = 2, pt.size=0.0001)

plot1 <- FeatureScatter(hswound.com, feature1 = "nCount_RNA", feature2 = "nFeature_RNA", group.by = "orig.ident") + RotatedAxis()
plot2 <- FeatureScatter(hswound.com, feature1 = "nCount_RNA", feature2 = "percent.mt", group.by = "orig.ident") + RotatedAxis()
plot3 <- FeatureScatter(hswound.com, feature1 = "nCount_RNA", feature2 = "percent.ribo", group.by = "orig.ident") + RotatedAxis()
plot4 <- FeatureScatter(hswound.com, feature1 = "nCount_RNA", feature2 = "percent.hb", group.by = "orig.ident") + RotatedAxis()
plot1 + plot2 + plot3 + plot4 + plot_layout(ncol = 2, guides = 'collect')

plot5 <- FeatureScatter(hswound.com, feature1 = "percent.mt", feature2 = "percent.malat1", group.by = "orig.ident") + RotatedAxis()
plot6 <- FeatureScatter(hswound.com, feature1 = "nCount_RNA", feature2 = "percent.malat1", group.by = "orig.ident") + RotatedAxis()
plot7 <- FeatureScatter(hswound.com, feature1 = "nFeature_RNA", feature2 = "percent.malat1", group.by = "orig.ident") + RotatedAxis()
plot8 <- FeatureScatter(hswound.com, feature1 = "nFeature_RNA", feature2 = "percent.mt", group.by = "orig.ident") + RotatedAxis()
plot5 + plot6 + plot7 + plot8 + plot_layout(ncol = 2, guides = 'collect')

rm(list=ls());gc()
##           used  (Mb) gc trigger   (Mb) limit (Mb)  max used   (Mb)
## Ncells 3077543 164.4    4899647  261.7         NA   4899647  261.7
## Vcells 9537344  72.8  825932025 6301.4      32768 764259286 5830.9
####----Compute the top relative expression of each gene per cell Use sparse matrix----####
par(mar = c(4, 8, 2, 1))
C <- hswound.com@assays$RNA@counts
C <- Matrix::t(Matrix::t(C)/Matrix::colSums(C)) * 100
most_expressed <- order(apply(C, 1, median), decreasing = T)[20:1]
boxplot(t(as.matrix(C[most_expressed, ])), cex = 0.1, las = 1, xlab = "% total count per cell", 
    col = (scales::hue_pal())(20)[20:1], horizontal = TRUE)

2.3 filter QC

####----Fitler gene expressed in at least 10 cells (>10) -> some uninterested gene hemoglobin genes -> MALAT1 gene -> MT genes----####
length(rownames(hswound.com))
table(hswound.com$orig.ident) #sample number
selected_f <- rownames(hswound.com)[Matrix::rowSums(hswound.com@assays$RNA@counts > 0) > 10]; length(selected_f)
selected_f <- selected_f[-c(grep("^HB[^(PES)]", selected_f))]; length(selected_f)
selected_f <- selected_f[-grep("MALAT1", selected_f)]; length(selected_f)
selected_f <- selected_f[-grep("^MT-", selected_f)]; length(selected_f)
hswound.com #Check first
hswound.com <- subset(hswound.com, features = selected_f)
hswound.com #Check if it works
identical(selected_f, rownames(hswound.com))
hswound.com <- subset(hswound.com, subset = nFeature_RNA > 500 & percent.mt < 20 & nCount_RNA > 1000 & nCount_RNA < 150000)
#after filtering
hswound.com
table(hswound.com$orig.ident)
saveRDS(hswound.com, "../00_Origin_objects/s0_original_seurat_object_noDoublets_afterqc.rds")

Cell numbers for each sample after quality control

## An object of class Seurat 
## 25778 features across 60450 samples within 1 assay 
## Active assay: RNA (25778 features, 0 variable features)
## 
##  PWH26D0  PWH26D1 PWH26D30  PWH26D7  PWH27D0  PWH27D1 PWH27D30  PWH27D7 
##     4306     4787     3470     4603     3319     6635     6072     6764 
##  PWH28D0  PWH28D1 PWH28D30  PWH28D7 
##     4175     4676     6002     5641

2.4 QC plot after filtering

VlnPlot(
  hswound.com,
  features = c("nFeature_RNA", "nCount_RNA", "percent.mt", "percent.ribo"),
  ncol = 2, pt.size=0)

plot1 <- FeatureScatter(hswound.com, feature1 = "nCount_RNA", feature2 = "nFeature_RNA", group.by = "orig.ident") + RotatedAxis()
plot2 <- FeatureScatter(hswound.com, feature1 = "nCount_RNA", feature2 = "percent.mt", group.by = "orig.ident") + RotatedAxis()
plot3 <- FeatureScatter(hswound.com, feature1 = "nCount_RNA", feature2 = "percent.ribo", group.by = "orig.ident") + RotatedAxis()
plot4 <- FeatureScatter(hswound.com, feature1 = "nCount_RNA", feature2 = "percent.hb", group.by = "orig.ident") + RotatedAxis()
plot1 + plot2 + plot3 + plot4 + plot_layout(ncol = 2, guides = 'collect')

plot5 <- FeatureScatter(hswound.com, feature1 = "percent.mt", feature2 = "percent.malat1", group.by = "orig.ident") + RotatedAxis()
plot6 <- FeatureScatter(hswound.com, feature1 = "nCount_RNA", feature2 = "percent.malat1", group.by = "orig.ident") + RotatedAxis()
plot7 <- FeatureScatter(hswound.com, feature1 = "nFeature_RNA", feature2 = "percent.malat1", group.by = "orig.ident") + RotatedAxis()
plot8 <- FeatureScatter(hswound.com, feature1 = "nFeature_RNA", feature2 = "percent.mt", group.by = "orig.ident") + RotatedAxis()
plot5 + plot6 + plot7 + plot8 + plot_layout(ncol = 2, guides = 'collect')

rm(list=ls());gc()
##           used  (Mb) gc trigger   (Mb) limit (Mb)  max used   (Mb)
## Ncells 3075011 164.3    4899647  261.7         NA   4899647  261.7
## Vcells 9300510  71.0  792958744 6049.8      32768 764259286 5830.9

Step 3: Normalization and feature selection

3.1 normalization using sctransform for each sample

hswound.list <- SplitObject(hswound.com, split.by = "orig.ident")
####----Do normalization for each dataset----####
####----SCTransform normalization----#### These steps were done in Uppmax
####----SCTransform (default) already normalize the nUMI and nGene count----####
####----Cell cycle: removing all signal associated with cell cycle can negatively impact downstream analysis, particularly in differentiating processes (like murine hematopoiesis), where stem cells are quiescent and differentiated cells are proliferating: regressing out the difference between the G2M and S phase scores. This means that signals separating non-cycling cells and cycling cells will be maintained, but differences in cell cycle phase among proliferating cells (which are often uninteresting) will be regressed out of the data----####
hswound.list <- lapply(X = hswound.list, FUN = function(x) {
  x <- NormalizeData(x)
  x <- CellCycleScoring(object = x, g2m.features = cc.genes$g2m.genes, s.features = cc.genes$s.genes, set.ident = FALSE)
  x$CC.Difference <- x$S.Score - x$G2M.Score
  x <- SCTransform(x, vars.to.regress = c("percent.mt", "CC.Difference"), variable.features.n = 4000, verbose = TRUE, return.only.var.genes = FALSE, method = "glmGamPoi")
  x <- RunPCA(x, npcs = 60, ndims.print = 1:5, nfeatures.print = 5, features = VariableFeatures(object = x))

})
rm(hswound.com);gc() #delete the object to save RAM memory
####----Save the individual seurat object----####
for (i in 1:length(hswound.list)) {
  saveRDS(hswound.list[[i]], file = paste0("s1_cleanSeurat_",names(hswound.list[i]), ".rds"))
}

Step 4: Batch-correction

4.1 Load all the SCT-normalized data

samples <- list.files("../s0_SeuratObject_SplitPerSample/", pattern = ".rds$")
sm_folders <- paste0("../s0_SeuratObject_SplitPerSample/", samples)
samplenames <- gsub("s1_cleanSeurat_", "", samples)
samplenames <- gsub("\\.rds", "", samplenames)
hswound.list <- list()
for (i in seq_along(samples)) {
  tmp_sms <- samplenames[i]
  hswound.list[[tmp_sms]] <- readRDS(sm_folders[i])
}
hswound.list
## $PWH26D0
## An object of class Seurat 
## 47643 features across 4306 samples within 2 assays 
## Active assay: SCT (21865 features, 4000 variable features)
##  1 other assay present: RNA
##  1 dimensional reduction calculated: pca
## 
## $PWH26D1
## An object of class Seurat 
## 47277 features across 4787 samples within 2 assays 
## Active assay: SCT (21499 features, 4000 variable features)
##  1 other assay present: RNA
##  1 dimensional reduction calculated: pca
## 
## $PWH26D30
## An object of class Seurat 
## 47849 features across 3470 samples within 2 assays 
## Active assay: SCT (22071 features, 4000 variable features)
##  1 other assay present: RNA
##  1 dimensional reduction calculated: pca
## 
## $PWH26D7
## An object of class Seurat 
## 49028 features across 4603 samples within 2 assays 
## Active assay: SCT (23250 features, 4000 variable features)
##  1 other assay present: RNA
##  1 dimensional reduction calculated: pca
## 
## $PWH27D0
## An object of class Seurat 
## 47020 features across 3319 samples within 2 assays 
## Active assay: SCT (21242 features, 4000 variable features)
##  1 other assay present: RNA
##  1 dimensional reduction calculated: pca
## 
## $PWH27D1
## An object of class Seurat 
## 47353 features across 6635 samples within 2 assays 
## Active assay: SCT (21575 features, 4000 variable features)
##  1 other assay present: RNA
##  1 dimensional reduction calculated: pca
## 
## $PWH27D30
## An object of class Seurat 
## 48220 features across 6072 samples within 2 assays 
## Active assay: SCT (22442 features, 4000 variable features)
##  1 other assay present: RNA
##  1 dimensional reduction calculated: pca
## 
## $PWH27D7
## An object of class Seurat 
## 49056 features across 6764 samples within 2 assays 
## Active assay: SCT (23278 features, 4000 variable features)
##  1 other assay present: RNA
##  1 dimensional reduction calculated: pca
## 
## $PWH28D0
## An object of class Seurat 
## 47676 features across 4175 samples within 2 assays 
## Active assay: SCT (21898 features, 4000 variable features)
##  1 other assay present: RNA
##  1 dimensional reduction calculated: pca
## 
## $PWH28D1
## An object of class Seurat 
## 47013 features across 4676 samples within 2 assays 
## Active assay: SCT (21235 features, 4000 variable features)
##  1 other assay present: RNA
##  1 dimensional reduction calculated: pca
## 
## $PWH28D30
## An object of class Seurat 
## 48189 features across 6002 samples within 2 assays 
## Active assay: SCT (22411 features, 4000 variable features)
##  1 other assay present: RNA
##  1 dimensional reduction calculated: pca
## 
## $PWH28D7
## An object of class Seurat 
## 48816 features across 5641 samples within 2 assays 
## Active assay: SCT (23038 features, 4000 variable features)
##  1 other assay present: RNA
##  1 dimensional reduction calculated: pca

Something need to think about it. (Variable genes across samples/conditions)

####----Check variable genes for each sample----####
vg2 <- lapply(hswound.list, function(x) VariableFeatures(x)[1:3000])
vg.all <- unique(unlist(vg2)) #unique top 3k variable genes among all samples
        
rn <- lapply(hswound.list, rownames) #total number of genes expressed in each sample
nE <- colSums(Reduce(rbind, lapply(rn, function(x) vg.all %in% x))) #Check the overlapping number of variable genes across samples
vg.all <- vg.all[nE == length(hswound.list)] #length(hswound.list) the number of samples
setdiff(features.all, vg.all) #features.all is from SelectIntegrationFeatures function directly (below). PS: no big difference if I didn't do any further filtering even though genes not expressed across all samples

4.2 Integrate all time points

####----Integration with SCTransform----####
####----First select top variable genes----####
features.all <- SelectIntegrationFeatures(object.list = hswound.list, nfeatures = 4500) #First select 4.5K genes
rn <- lapply(hswound.list, rownames) 
nE <- colSums(Reduce(rbind, lapply(rn, function(x) features.all %in% x))) 
features.all <- features.all[nE == length(hswound.list)][1:4000] #Keep only 4K variable genes expressed in all samples

eachvg <- list(
  PWH26D0=VariableFeatures(hswound.list$PWH26D0), 
  PWH26D1=VariableFeatures(hswound.list$PWH26D1),
  PWH26D30=VariableFeatures(hswound.list$PWH26D30),
  PWH26D7=VariableFeatures(hswound.list$PWH26D7),
  PWH27D0=VariableFeatures(hswound.list$PWH27D0),
  PWH27D1=VariableFeatures(hswound.list$PWH27D1),
  PWH27D30=VariableFeatures(hswound.list$PWH27D30),
  PWH27D7=VariableFeatures(hswound.list$PWH27D7),
  PWH28D0=VariableFeatures(hswound.list$PWH28D0),
  PWH28D1=VariableFeatures(hswound.list$PWH28D1),
  PWH28D30=VariableFeatures(hswound.list$PWH28D30),
  PWH28D7=VariableFeatures(hswound.list$PWH28D7),
  Allvg=features.all)
source("../Functions/overlap_phyper2.R")
overlap_phyper2(eachvg, eachvg)

## $P
##                PWH26D0       PWH26D1      PWH26D30 PWH26D7       PWH27D0
## PWH26D0   0.000000e+00  0.000000e+00  0.000000e+00       0  0.000000e+00
## PWH26D1   0.000000e+00  0.000000e+00  0.000000e+00       0  0.000000e+00
## PWH26D30  0.000000e+00  0.000000e+00  0.000000e+00       0  0.000000e+00
## PWH26D7   0.000000e+00  0.000000e+00  0.000000e+00       0  0.000000e+00
## PWH27D0   0.000000e+00  0.000000e+00  0.000000e+00       0  0.000000e+00
## PWH27D1   0.000000e+00  0.000000e+00  0.000000e+00       0  0.000000e+00
## PWH27D30  0.000000e+00  0.000000e+00  0.000000e+00       0  0.000000e+00
## PWH27D7  5.486432e-268 3.921443e-309  0.000000e+00       0  0.000000e+00
## PWH28D0   0.000000e+00  0.000000e+00  0.000000e+00       0  0.000000e+00
## PWH28D1   0.000000e+00  0.000000e+00 1.996025e-321       0  0.000000e+00
## PWH28D30 8.199847e-304 1.806570e-264  0.000000e+00       0 1.435086e-304
## PWH28D7   0.000000e+00  0.000000e+00  0.000000e+00       0  0.000000e+00
## Allvg     0.000000e+00  0.000000e+00  0.000000e+00       0  0.000000e+00
##                     NA            NA            NA      NA            NA
##                PWH27D1      PWH27D30       PWH27D7       PWH28D0       PWH28D1
## PWH26D0   0.000000e+00  0.000000e+00 5.486432e-268  0.000000e+00  0.000000e+00
## PWH26D1   0.000000e+00  0.000000e+00 3.921443e-309  0.000000e+00  0.000000e+00
## PWH26D30  0.000000e+00  0.000000e+00  0.000000e+00  0.000000e+00 1.996025e-321
## PWH26D7   0.000000e+00  0.000000e+00  0.000000e+00  0.000000e+00  0.000000e+00
## PWH27D0   0.000000e+00  0.000000e+00  0.000000e+00  0.000000e+00  0.000000e+00
## PWH27D1   0.000000e+00  0.000000e+00  0.000000e+00  0.000000e+00  0.000000e+00
## PWH27D30  0.000000e+00  0.000000e+00  0.000000e+00  0.000000e+00 1.984499e-311
## PWH27D7   0.000000e+00  0.000000e+00  0.000000e+00 2.505588e-305 2.657925e-302
## PWH28D0   0.000000e+00  0.000000e+00 2.505588e-305  0.000000e+00  0.000000e+00
## PWH28D1   0.000000e+00 1.984499e-311 2.657925e-302  0.000000e+00  0.000000e+00
## PWH28D30 3.178838e-243  0.000000e+00  0.000000e+00  0.000000e+00 3.079424e-245
## PWH28D7   0.000000e+00  0.000000e+00  0.000000e+00  0.000000e+00  0.000000e+00
## Allvg     0.000000e+00  0.000000e+00  0.000000e+00  0.000000e+00  0.000000e+00
##                     NA            NA            NA            NA            NA
##               PWH28D30 PWH28D7 Allvg   
## PWH26D0  8.199847e-304       0     0 NA
## PWH26D1  1.806570e-264       0     0 NA
## PWH26D30  0.000000e+00       0     0 NA
## PWH26D7   0.000000e+00       0     0 NA
## PWH27D0  1.435086e-304       0     0 NA
## PWH27D1  3.178838e-243       0     0 NA
## PWH27D30  0.000000e+00       0     0 NA
## PWH27D7   0.000000e+00       0     0 NA
## PWH28D0   0.000000e+00       0     0 NA
## PWH28D1  3.079424e-245       0     0 NA
## PWH28D30  0.000000e+00       0     0 NA
## PWH28D7   0.000000e+00       0     0 NA
## Allvg     0.000000e+00       0     0 NA
##                     NA      NA    NA NA
## 
## $M
##          PWH26D0 PWH26D1 PWH26D30 PWH26D7 PWH27D0 PWH27D1 PWH27D30 PWH27D7
## PWH26D0     4000    2982     2964    2912    3260    2916     2854    2767
## PWH26D1     2982    4000     2918    2933    3024    3343     2864    2823
## PWH26D30    2964    2918     4000    3097    2937    2861     3172    2995
## PWH26D7     2912    2933     3097    4000    2913    2914     3016    3209
## PWH27D0     3260    3024     2937    2913    4000    3055     2939    2862
## PWH27D1     2916    3343     2861    2914    3055    4000     2874    2886
## PWH27D30    2854    2864     3172    3016    2939    2874     4000    3040
## PWH27D7     2767    2823     2995    3209    2862    2886     3040    4000
## PWH28D0     3214    2923     2966    2889    3197    2918     2894    2818
## PWH28D1     2900    3344     2839    2869    2981    3347     2826    2814
## PWH28D30    2816    2762     3195    2979    2817    2731     3190    2983
## PWH28D7     2904    2952     3067    3198    2919    2998     3016    3178
## Allvg       3229    3288     3209    3191    3318    3279     3193    3117
## nTotal2     4000    4000     4000    4000    4000    4000     4000    4000
##          PWH28D0 PWH28D1 PWH28D30 PWH28D7 Allvg nTotal1
## PWH26D0     3214    2900     2816    2904  3229    4000
## PWH26D1     2923    3344     2762    2952  3288    4000
## PWH26D30    2966    2839     3195    3067  3209    4000
## PWH26D7     2889    2869     2979    3198  3191    4000
## PWH27D0     3197    2981     2817    2919  3318    4000
## PWH27D1     2918    3347     2731    2998  3279    4000
## PWH27D30    2894    2826     3190    3016  3193    4000
## PWH27D7     2818    2814     2983    3178  3117    4000
## PWH28D0     4000    2913     2880    2959  3217    4000
## PWH28D1     2913    4000     2734    2944  3251    4000
## PWH28D30    2880    2734     4000    3032  3103    4000
## PWH28D7     2959    2944     3032    4000  3202    4000
## Allvg       3217    3251     3103    3202  4000    4000
## nTotal2     4000    4000     4000    4000  4000    8025
## 
## $plot
####----Prepare the integration----####
hswound.list <- PrepSCTIntegration(object.list = hswound.list, anchor.features = features.all)

hswound.combined.sct <- merge(hswound.list[[1]], 
                         y = hswound.list[2:length(hswound.list)],
                         project = "humanwound",
                         merge.data = TRUE)
hswound.combined.sct
##           used  (Mb) gc trigger    (Mb) limit (Mb)   max used    (Mb)
## Ncells 3005966 160.6    4899647   261.7         NA    4899647   261.7
## Vcells 5952144  45.5 3145689968 23999.8      32768 2889092333 22042.1
VariableFeatures(hswound.combined.sct) <- features.all
hswound.combined.sct <- RunPCA(object = hswound.combined.sct, assay = "SCT", npcs = 60, features = VariableFeatures(hswound.combined.sct))
table(hswound.combined.sct$orig.ident)
## 
##  PWH26D0  PWH26D1  PWH26D7 PWH26D30  PWH27D0  PWH27D1  PWH27D7 PWH27D30 
##     4306     4787     4603     3470     3319     6635     6764     6072 
##  PWH28D0  PWH28D1  PWH28D7 PWH28D30 
##     4175     4676     5641     6002
ElbowPlot(hswound.combined.sct, ndims = 60, reduction = "pca")

####----Check the genes contributing to top PCs----####
#If there is any gene related to the ribosomal genes (^RP[S|L]),
VizDimLoadings(hswound.combined.sct, dims = 1:12, reduction = "pca")

hswound.combined.sct <- RunHarmony(object = hswound.combined.sct,
                                    assay.use = "SCT",
                                    reduction = "pca",
                                    dims.use = 1:40,
                                    group.by.vars = "orig.ident",
                                    plot_convergence = TRUE)

#rn pca, umap, clustering
hswound.combined.sct <- RunUMAP(object = hswound.combined.sct, assay = "SCT", reduction = "harmony", dims = 1:40)
hswound.combined.sct <- FindNeighbors(object = hswound.combined.sct, assay = "SCT", reduction = "harmony", dims = 1:40, k.param = 40)
hswound.combined.sct <- FindClusters(object = hswound.combined.sct, resolution = 0.5)
hswound.combined.sct <- FindClusters(object = hswound.combined.sct, resolution = 0.8)
hswound.combined.sct$orig.ident <- factor(x = hswound.combined.sct$orig.ident, levels =
                                            c("PWH26D0", "PWH26D1", "PWH26D7","PWH26D30",
                                              "PWH27D0", "PWH27D1", "PWH27D7","PWH27D30",
                                              "PWH28D0", "PWH28D1", "PWH28D7","PWH28D30"))
hswound.combined.sct$Condition <- factor(x = hswound.combined.sct$Condition, levels =
                                            c("Skin", "Wound1", "Wound7","Wound30"))
saveRDS(hswound.combined.sct, file = "s1_cleanSeurat_harmony_allSamples_clusters.rds")

4.3 UMAP plots of Clusters, Sampled, Group types, Gender

DimPlot(hswound.combined.sct, reduction = "umap", group.by='seurat_clusters', label=TRUE) +
  ggtitle('Cluster')

DimPlot(hswound.combined.sct, reduction = "umap", group.by='orig.ident', label=FALSE) +
  ggtitle('Samples')

DimPlot(hswound.combined.sct, reduction = "umap", group.by = "seurat_clusters", pt.size = .001, split.by = 'orig.ident', ncol = 4, label = TRUE) + NoLegend()

DimPlot(hswound.combined.sct, reduction = "umap", group.by='Condition', label=FALSE) +
  ggtitle('Groups')

DimPlot(hswound.combined.sct, reduction = "umap", group.by = "seurat_clusters", pt.size = .001, split.by = 'Condition', ncol = 4, label = TRUE) + NoLegend()

DimPlot(hswound.combined.sct, reduction = "umap", group.by='Age', label=FALSE) +
  ggtitle('Age')

DimPlot(hswound.combined.sct, reduction = "umap", group.by='Gender', label=FALSE) +
  ggtitle('Gender')

# Explore whether clusters segregate by cell cycle phase
DimPlot(hswound.combined.sct,
        label = TRUE,
        split.by = "Phase")  + NoLegend()

(DimPlot(hswound.combined.sct, reduction = "umap", group.by='seurat_clusters', label=TRUE) + ggtitle('Clusters') + NoAxes() + NoLegend()) +
  (FeaturePlot(hswound.combined.sct, features = c("MKI67"), reduction = "umap", cols = c("gray","red")) + NoLegend() + NoAxes())

# Determine metrics to plot present in hswound.combined.sct@meta.data
metrics <-  c("nCount_RNA", "nFeature_RNA", "S.Score", "G2M.Score", "percent.mt")

FeaturePlot(hswound.combined.sct,
            reduction = "umap",
            features = metrics,
            pt.size = 0.001,
            label = FALSE)

4.4 stacked barplot showing the proportion of cells from each batch in each cluster

####----Please pay attentions to the original numbers of cells per sample----####
clusters <- unique(hswound.combined.sct@meta.data[["seurat_clusters"]])
clusters <- clusters[order(clusters)]
df <- data.frame()
for(i in 1:length(clusters)){
  SmCell_sum <- table(hswound.combined.sct$orig.ident)
  tmp.df1 <- hswound.combined.sct@meta.data %>% subset(seurat_clusters == clusters[i]) %>% select(orig.ident) %>% table()
  if(length(tmp.df1) == 12){
    #First normalize the total cell counts per sample
    cur_df <- as.data.frame(tmp.df1 / SmCell_sum)
    colnames(cur_df) <- c("Sample", "Freq")
    #Calculate the normalized proportion 
    cur_df$Freq <- cur_df$Freq * 1/(sum(cur_df$Freq))
    cur_df$Cluster <- clusters[i]
    df <- rbind(df, cur_df)
  } else {
    #print(i);print(tmp.df1)
    #only include the matched samples
    match.sample <- SmCell_sum[names(SmCell_sum) %in% names(tmp.df1)]
    #print(match.sample)
    #First normalize the total cell counts per sample
    cur_df <- as.data.frame(tmp.df1 / match.sample)
    colnames(cur_df) <- c("Sample", "Freq")
    #Calculate the normalized proportion 
    cur_df$Freq <- cur_df$Freq * 1/(sum(cur_df$Freq))
    cur_df$Cluster <- clusters[i]
    df <- rbind(df, cur_df)
  }
}

##Plot for each sample
ggplot(df, aes(x = Cluster, y = Freq, fill = Sample)) +
  geom_bar(position = "stack", stat = "identity") +
  scale_fill_brewer(palette = "Paired") + # Paired or Set3 maximum variable = 12
  xlab('Clsuter') +
  scale_y_continuous(breaks = seq(0, 1, 0.1),
                     expand = c(0, 0),
                     name = 'Percentage') +
  theme_bw() +
  theme(
    panel.grid.major.x = element_blank(),
    panel.grid.major.y = element_blank(),
    panel.grid.minor = element_blank(),
    strip.background = element_blank(),
    #panel.border = element_rect(size = 0.35),
    axis.ticks = element_blank()#,
    #axis.text.x = element_text(size = 5)
  ) +
  coord_flip()

##Plot for each group
df.group <- df %>% mutate(Sample = gsub("PWH..", "", Sample)) %>% group_by(Cluster, Sample) %>% 
  summarise(Freq = sum(Freq))
## `summarise()` has grouped output by 'Cluster'. You can override using the `.groups` argument.
df.group$Sample <- factor(df.group$Sample, levels = c("D0", "D1", "D7", "D30"))
ggplot(df.group, aes(x = Cluster, y = Freq, fill = Sample)) +
  geom_bar(position = "stack", stat = "identity") +
  scale_fill_brewer(palette = "Set1") + # Paired or Set3 maximum variable = 12
  xlab('Clsuter') +
  scale_y_continuous(breaks = seq(0, 1, 0.1),
                     expand = c(0, 0),
                     name = 'Percentage') +
  theme_bw() +
  theme(
    panel.grid.major.x = element_blank(),
    panel.grid.major.y = element_blank(),
    panel.grid.minor = element_blank(),
    strip.background = element_blank(),
    #panel.border = element_rect(size = 0.35),
    axis.ticks = element_blank()#,
    #axis.text.x = element_text(size = 5)
  ) +
  coord_flip()

4.5 Clusters determined by resolutions

# Clustering with louvain (algorithm 1) using different resolutions
for (res in c(0.2, 0.4, 0.5, 0.6, 0.8, 1.0, 1.2, 1.5, 2)) {
    hswound.combined.sct <- FindClusters(hswound.combined.sct, resolution = res, verbose = FALSE)
}

plot_grid(ncol = 2, 
          DimPlot(hswound.combined.sct, reduction = "umap", group.by = "SCT_snn_res.0.2") +
            ggtitle("louvain_0.2"), 
          DimPlot(hswound.combined.sct, reduction = "umap", group.by = "SCT_snn_res.0.4") + 
            ggtitle("louvain_0.4"), 
          DimPlot(hswound.combined.sct, reduction = "umap", group.by = "SCT_snn_res.0.5") + 
            ggtitle("louvain_0.5"),
          DimPlot(hswound.combined.sct, reduction = "umap", group.by = "SCT_snn_res.0.6") +
            ggtitle("louvain_0.6"), 
          DimPlot(hswound.combined.sct, reduction = "umap", group.by = "SCT_snn_res.0.8") + 
            ggtitle("louvain_0.8"), 
          DimPlot(hswound.combined.sct, reduction = "umap", group.by = "SCT_snn_res.1") + 
            ggtitle("louvain_1"),
          DimPlot(hswound.combined.sct, reduction = "umap", group.by = "SCT_snn_res.1.2") +
            ggtitle("louvain_1.2"), 
          DimPlot(hswound.combined.sct, reduction = "umap", group.by = "SCT_snn_res.1.5") + 
            ggtitle("louvain_1.5"), 
          DimPlot(hswound.combined.sct, reduction = "umap", group.by = "SCT_snn_res.2") + 
            ggtitle("louvain_2"))

4.6 Cluster tree plot

# install.packages('clustree')
suppressPackageStartupMessages(library(clustree))

clustree(hswound.combined.sct@meta.data, prefix = "SCT_snn_res.")
## Warning: The `add` argument of `group_by()` is deprecated as of dplyr 1.0.0.
## Please use the `.add` argument instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.

##           used  (Mb) gc trigger    (Mb) limit (Mb)   max used    (Mb)
## Ncells 3215233 171.8    9023238   481.9         NA    9023238   481.9
## Vcells 8409359  64.2 2516551975 19199.8      32768 3145681718 23999.7

Step 5. DE analysis using original RNA assay

5.1 FindAllMarkers

####----Change to RNA assay to do the DE analysis----####
DefaultAssay(hswound.combined.sct) <- "RNA" #DE for all genes instead of only using variable gene
hswound.combined.sct

hswound.combined.sct <- NormalizeData(hswound.combined.sct, verbose = TRUE,  normalization.method = "LogNormalize", scale.factor = 10000)

####----Calculate the scale data for following heatmap visualization of marker genes----####
hswound.combined.sct <- ScaleData(object = hswound.combined.sct, features = rownames(hswound.combined.sct), vars.to.regress = c("percent.mt", "CC.Difference", "nCount_RNA", "nFeature_RNA"))
#save data
saveRDS(hswound.combined.sct, "s2_cleanSeurat_harmony_allSamples_clusters_normGenes.rds") #save normalized gene expressions

Idents(hswound.combined.sct) <- "SCT_snn_res.0.8" 
table(hswound.combined.sct$SCT_snn_res.0.8)
####----FindAllMarkers for all clusters----####
markers <- FindAllMarkers(
  hswound.combined.sct,
  only.pos = TRUE,
  min.pct = 0.25,  #min.pct = 0.5, logfc.threshold = 0.5,
  logfc.threshold = 0.25,
  test.use = "MAST") #MAST has good FDR control and is faster than DESeq2, default test is wilcox

####----add the gene description to the cluster marker genes----####
require(biomaRt) #listMarts() : Check the version and dataset
hs.ensembl <- useMart("ensembl", dataset = "hsapiens_gene_ensembl") #listAttributes(mart = hs.ensembl)
annot <- getBM(attributes = c("external_gene_name", "entrezgene_id", "gene_biotype","description"), mart=hs.ensembl) %>% 
  distinct(external_gene_name, .keep_all = TRUE) %>% tidyr::separate(description, c("Gene_anno", "Other"), sep="\\[Source") %>% 
  dplyr::select(-5)

annot <- readRDS("../Functions/humanAnnotation.rds")
####----add the info and reorder the columns----####
markers <- markers %>% left_join(., annot, by=c("gene" = "external_gene_name")) %>% 
  dplyr::select(6, 7, 2:4, 1, 5, everything()) %>% arrange(cluster, desc(avg_log2FC))
data.table::fwrite(markers, file="s2_cleanSeurat_harmony_allSamples_markerGene_ori.txt", sep="\t")
hswound.combined.sct <- readRDS(file = "s2_cleanSeurat_harmony_allSamples_clusters_normGenes.rds") #reload the clustering results with normalized expressions for further visualization
hswound.combined.sct
## An object of class Seurat 
## 51179 features across 60450 samples within 2 assays 
## Active assay: RNA (25778 features, 0 variable features)
##  1 other assay present: SCT
##  3 dimensional reductions calculated: pca, harmony, umap
variable.genes <- VariableFeatures(hswound.combined.sct, assay="SCT") %>% as.data.frame() %>% rename("gene" = ".") %>% 
  mutate(VariableGene = "Yes")
markers <- data.table::fread("s2_cleanSeurat_harmony_allSamples_markerGene_ori.txt") #marker genes
####----Add the information about if the gene is the variable gene----####
markers <- markers %>% left_join(., variable.genes, by=c("gene" = "gene"))

5.2 Draw marker plots (Heatmap and Violin plot)

top5 <- markers %>% group_by(cluster) %>% top_n(n = 5, wt = avg_log2FC)
####----Heatmap plot----####
DoHeatmap(hswound.combined.sct, features = as.character(unique(top5$gene)), group.by='SCT_snn_res.0.8', label=TRUE, size = 2, angle = 30, disp.min = -2.5)

####----Violin plot----####
top1 <- markers %>% group_by(cluster) %>% top_n(n = 1, wt = avg_log2FC)
VlnPlot(hswound.combined.sct, features = top1$gene, group.by='SCT_snn_res.0.8', ncol = 2, pt.size = 0) 

5.3 Check known marker genes

# set up list of canonical cell type markers
canonical_markers <- list(
  'Keratinocytes' = c('KRT5', 'KRT14', 'KRT1', 'KRT10'),
  'Fibroblasts' = c('COL1A1', 'COL1A2', 'DCN'),
  'Melanocyte' =    c('PMEL',   'MLANA'),
  'Endothelial-cells' = c('PECAM1', 'VWF'),
  'T-cells' = c('CD52', 'CD3D'),
  'Dendritic-cells' = c('CD74')
)
plot_list <- FeaturePlot(
  hswound.combined.sct,
  features=unlist(canonical_markers),
  combine=FALSE, cols = c("gray","red")
)

# apply theme to each feature plot
for(i in 1:length(plot_list)){
  plot_list[[i]] <- plot_list[[i]] + NoLegend() + NoAxes()
}

cowplot::plot_grid(plotlist = plot_list, ncol = 4)

rm(plot_list);gc(verbose = FALSE)
##              used    (Mb) gc trigger    (Mb) limit (Mb)   max used    (Mb)
## Ncells    3351975   179.1    9023238   481.9         NA    9023238   481.9
## Vcells 3216676581 24541.3 3820091770 29145.0      32768 3820091770 29145.0

5.4 Annotate clusters by using R packages

library(SingleR)
## Loading required package: SummarizedExperiment
## Loading required package: MatrixGenerics
## Loading required package: matrixStats
## 
## Attaching package: 'matrixStats'
## The following object is masked from 'package:dplyr':
## 
##     count
## 
## Attaching package: 'MatrixGenerics'
## The following objects are masked from 'package:matrixStats':
## 
##     colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
##     colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
##     colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
##     colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
##     colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
##     colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
##     colWeightedMeans, colWeightedMedians, colWeightedSds,
##     colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
##     rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
##     rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
##     rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
##     rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
##     rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
##     rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
##     rowWeightedSds, rowWeightedVars
## Loading required package: GenomicRanges
## Loading required package: stats4
## Loading required package: BiocGenerics
## Loading required package: parallel
## 
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, parApply, parCapply, parLapply,
##     parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:dplyr':
## 
##     combine, intersect, setdiff, union
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, basename, cbind, colnames,
##     dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
##     grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
##     order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
##     rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
##     union, unique, unsplit, which.max, which.min
## Loading required package: S4Vectors
## 
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:dplyr':
## 
##     first, rename
## The following object is masked from 'package:tidyr':
## 
##     expand
## The following objects are masked from 'package:base':
## 
##     expand.grid, I, unname
## Loading required package: IRanges
## 
## Attaching package: 'IRanges'
## The following objects are masked from 'package:dplyr':
## 
##     collapse, desc, slice
## The following object is masked from 'package:purrr':
## 
##     reduce
## Loading required package: GenomeInfoDb
## Loading required package: Biobase
## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.
## 
## Attaching package: 'Biobase'
## The following object is masked from 'package:MatrixGenerics':
## 
##     rowMedians
## The following objects are masked from 'package:matrixStats':
## 
##     anyMissing, rowMedians
## 
## Attaching package: 'SummarizedExperiment'
## The following object is masked from 'package:SeuratObject':
## 
##     Assays
## The following object is masked from 'package:Seurat':
## 
##     Assays
library(celldex)
## 
## Attaching package: 'celldex'
## The following objects are masked from 'package:SingleR':
## 
##     BlueprintEncodeData, DatabaseImmuneCellExpressionData,
##     HumanPrimaryCellAtlasData, ImmGenData, MonacoImmuneData,
##     MouseRNAseqData, NovershternHematopoieticData
hpca.se <- HumanPrimaryCellAtlasData()
## snapshotDate(): 2021-05-18
## see ?celldex and browseVignettes('celldex') for documentation
## loading from cache
## see ?celldex and browseVignettes('celldex') for documentation
## loading from cache
hpca.se$label.main <- str_replace_all(hpca.se$label.main, 
                                      c("NK_cell" = "NK cells", "B_cell" = "B cells", 
                                        "DC" = "Dendritic cells","HSC_-G-CSF" = "HSC G-CSF",
                                        "Monocyte" = "Monocytes", "Astrocyte" = "Astrocytes",
                                        "Erythroblast" = "Erythroblasts", "Macrophage" = "Macrophages",
                                        "BM" = "BMs", "MSC"="MSCs",
                                        "CMP" = "CMPs", "GMP" = "GMPs", 
                                        "MEP"="MEPs", "Myelocyte"="Myelocytes",
                                        "Neuroepithelial_cell"="Neuroepithelial cells", "T_cells" = "T cells", 
                                        "iPS_cells" = "iPS cells", "Endothelial_cells" = "Endothelial cells", 
                                        "Tissue_stem_cells" = "Tissue stem cells", "Embryonic_stem_cells" = "Embryonic stem cells",
                                        "Smooth_muscle_cells" = "Smooth muscle cells", "Epithelial_cells" = "Epithelial cells"))
hpca.se$label.main[hpca.se$label.main == "HSC_CD34+"] <-"HSC CD34"
hpca.se$label.main[hpca.se$label.main == "Pro-B cells_CD34+"] <-"Pro-B cells CD34 Pos"
hpca.se$label.main[hpca.se$label.main == "Pre-B cells_CD34-"] <-"Pre-B cells CD34 Neg"

####----Check the unique cell types----####
hpca.se$label.main %>% unique()
##  [1] "Dendritic cells"       "Smooth muscle cells"   "Epithelial cells"     
##  [4] "B cells"               "Neutrophils"           "T cells"              
##  [7] "Monocytes"             "Erythroblasts"         "BMs & Prog."          
## [10] "Endothelial cells"     "Gametocytes"           "Neurons"              
## [13] "Keratinocytes"         "HSC G-CSF"             "Macrophages"          
## [16] "NK cells"              "Embryonic stem cells"  "Tissue stem cells"    
## [19] "Chondrocytes"          "Osteoblasts"           "BMs"                  
## [22] "Platelets"             "Fibroblasts"           "iPS cells"            
## [25] "Hepatocytes"           "MSCs"                  "Neuroepithelial cells"
## [28] "Astrocytes"            "HSC CD34"              "CMPs"                 
## [31] "GMPs"                  "MEPs"                  "Myelocytes"           
## [34] "Pre-B cells CD34 Neg"  "Pro-B cells CD34 Pos"  "Pro-Myelocytes"
#Get the count data from Seurat Object
hswound.singleR <- GetAssayData(hswound.combined.sct)
##First check the main cell type
Idents(hswound.combined.sct) <- "SCT_snn_res.0.8" #Choose different resolutions to annotate
pred.hesc <- SingleR(test = hswound.singleR, ref = hpca.se, assay.type.test=1, labels = hpca.se$label.main, clusters = Idents(hswound.combined.sct))
##Plot of main cell type
plotScoreHeatmap(pred.hesc)

celltype.main <- as.data.frame(pred.hesc) %>% tibble::rownames_to_column(var = "Cluster")
celltype.main.score <- as.data.frame(pred.hesc) %>% t() %>% as.data.frame() %>% tibble::rownames_to_column(var = "Cluster") %>%  mutate(SingleR = "MainCellType")

celltype.main$Cluster <- parse_integer(celltype.main$Cluster)
markers <- markers %>% left_join(., celltype.main[, c(1,38)], by=c("cluster" = "Cluster")) #Choose the first.label

####----Check the unique fine cell types----####
hpca.se$label.fine %>% unique() #sub cell types [which represents the highest resolution]
##   [1] "DC:monocyte-derived:immature"                          
##   [2] "DC:monocyte-derived:Galectin-1"                        
##   [3] "DC:monocyte-derived:LPS"                               
##   [4] "DC:monocyte-derived"                                   
##   [5] "Smooth_muscle_cells:bronchial:vit_D"                   
##   [6] "Smooth_muscle_cells:bronchial"                         
##   [7] "Epithelial_cells:bronchial"                            
##   [8] "B_cell"                                                
##   [9] "Neutrophil"                                            
##  [10] "T_cell:CD8+_Central_memory"                            
##  [11] "T_cell:CD8+"                                           
##  [12] "T_cell:CD4+"                                           
##  [13] "T_cell:CD8+_effector_memory_RA"                        
##  [14] "T_cell:CD8+_effector_memory"                           
##  [15] "T_cell:CD8+_naive"                                     
##  [16] "Monocyte"                                              
##  [17] "Erythroblast"                                          
##  [18] "BM"                                                    
##  [19] "DC:monocyte-derived:rosiglitazone"                     
##  [20] "DC:monocyte-derived:AM580"                             
##  [21] "DC:monocyte-derived:rosiglitazone/AGN193109"           
##  [22] "DC:monocyte-derived:anti-DC-SIGN_2h"                   
##  [23] "Endothelial_cells:HUVEC"                               
##  [24] "Endothelial_cells:HUVEC:Borrelia_burgdorferi"          
##  [25] "Endothelial_cells:HUVEC:IFNg"                          
##  [26] "Endothelial_cells:lymphatic"                           
##  [27] "Endothelial_cells:HUVEC:Serum_Amyloid_A"               
##  [28] "Endothelial_cells:lymphatic:TNFa_48h"                  
##  [29] "T_cell:effector"                                       
##  [30] "T_cell:CCR10+CLA+1,25(OH)2_vit_D3/IL-12"               
##  [31] "T_cell:CCR10-CLA+1,25(OH)2_vit_D3/IL-12"               
##  [32] "Gametocytes:spermatocyte"                              
##  [33] "DC:monocyte-derived:A._fumigatus_germ_tubes_6h"        
##  [34] "Neurons:ES_cell-derived_neural_precursor"              
##  [35] "Keratinocytes"                                         
##  [36] "Keratinocytes:IL19"                                    
##  [37] "Keratinocytes:IL20"                                    
##  [38] "Keratinocytes:IL22"                                    
##  [39] "Keratinocytes:IL24"                                    
##  [40] "Keratinocytes:IL26"                                    
##  [41] "Keratinocytes:KGF"                                     
##  [42] "Keratinocytes:IFNg"                                    
##  [43] "Keratinocytes:IL1b"                                    
##  [44] "HSC_-G-CSF"                                            
##  [45] "DC:monocyte-derived:mature"                            
##  [46] "Monocyte:anti-FcgRIIB"                                 
##  [47] "Macrophage:monocyte-derived:IL-4/cntrl"                
##  [48] "Macrophage:monocyte-derived:IL-4/Dex/cntrl"            
##  [49] "Macrophage:monocyte-derived:IL-4/Dex/TGFb"             
##  [50] "Macrophage:monocyte-derived:IL-4/TGFb"                 
##  [51] "Monocyte:leukotriene_D4"                               
##  [52] "NK_cell"                                               
##  [53] "NK_cell:IL2"                                           
##  [54] "Embryonic_stem_cells"                                  
##  [55] "Tissue_stem_cells:iliac_MSC"                           
##  [56] "Chondrocytes:MSC-derived"                              
##  [57] "Osteoblasts"                                           
##  [58] "Tissue_stem_cells:BM_MSC"                              
##  [59] "Osteoblasts:BMP2"                                      
##  [60] "Tissue_stem_cells:BM_MSC:BMP2"                         
##  [61] "Tissue_stem_cells:BM_MSC:TGFb3"                        
##  [62] "DC:monocyte-derived:Poly(IC)"                          
##  [63] "DC:monocyte-derived:CD40L"                             
##  [64] "DC:monocyte-derived:Schuler_treatment"                 
##  [65] "DC:monocyte-derived:antiCD40/VAF347"                   
##  [66] "Tissue_stem_cells:dental_pulp"                         
##  [67] "T_cell:CD4+_central_memory"                            
##  [68] "T_cell:CD4+_effector_memory"                           
##  [69] "T_cell:CD4+_Naive"                                     
##  [70] "Smooth_muscle_cells:vascular"                          
##  [71] "Smooth_muscle_cells:vascular:IL-17"                    
##  [72] "Platelets"                                             
##  [73] "Epithelial_cells:bladder"                              
##  [74] "Macrophage:monocyte-derived"                           
##  [75] "Macrophage:monocyte-derived:M-CSF"                     
##  [76] "Macrophage:monocyte-derived:M-CSF/IFNg"                
##  [77] "Macrophage:monocyte-derived:M-CSF/Pam3Cys"             
##  [78] "Macrophage:monocyte-derived:M-CSF/IFNg/Pam3Cys"        
##  [79] "Macrophage:monocyte-derived:IFNa"                      
##  [80] "Gametocytes:oocyte"                                    
##  [81] "Monocyte:F._tularensis_novicida"                       
##  [82] "Endothelial_cells:HUVEC:B._anthracis_LT"               
##  [83] "B_cell:Germinal_center"                                
##  [84] "B_cell:Plasma_cell"                                    
##  [85] "B_cell:Naive"                                          
##  [86] "B_cell:Memory"                                         
##  [87] "DC:monocyte-derived:AEC-conditioned"                   
##  [88] "Tissue_stem_cells:lipoma-derived_MSC"                  
##  [89] "Tissue_stem_cells:adipose-derived_MSC_AM3"             
##  [90] "Endothelial_cells:HUVEC:FPV-infected"                  
##  [91] "Endothelial_cells:HUVEC:PR8-infected"                  
##  [92] "Endothelial_cells:HUVEC:H5N1-infected"                 
##  [93] "Macrophage:monocyte-derived:S._aureus"                 
##  [94] "Fibroblasts:foreskin"                                  
##  [95] "iPS_cells:skin_fibroblast-derived"                     
##  [96] "iPS_cells:skin_fibroblast"                             
##  [97] "T_cell:gamma-delta"                                    
##  [98] "Monocyte:CD14+"                                        
##  [99] "Macrophage:Alveolar"                                   
## [100] "Macrophage:Alveolar:B._anthacis_spores"                
## [101] "Neutrophil:inflam"                                     
## [102] "iPS_cells:PDB_fibroblasts"                             
## [103] "iPS_cells:PDB_1lox-17Puro-5"                           
## [104] "iPS_cells:PDB_1lox-17Puro-10"                          
## [105] "iPS_cells:PDB_1lox-21Puro-20"                          
## [106] "iPS_cells:PDB_1lox-21Puro-26"                          
## [107] "iPS_cells:PDB_2lox-5"                                  
## [108] "iPS_cells:PDB_2lox-22"                                 
## [109] "iPS_cells:PDB_2lox-21"                                 
## [110] "iPS_cells:PDB_2lox-17"                                 
## [111] "iPS_cells:CRL2097_foreskin"                            
## [112] "iPS_cells:CRL2097_foreskin-derived:d20_hepatic_diff"   
## [113] "iPS_cells:CRL2097_foreskin-derived:undiff."            
## [114] "B_cell:CXCR4+_centroblast"                             
## [115] "B_cell:CXCR4-_centrocyte"                              
## [116] "Endothelial_cells:HUVEC:VEGF"                          
## [117] "iPS_cells:fibroblasts"                                 
## [118] "iPS_cells:fibroblast-derived:Direct_del._reprog"       
## [119] "iPS_cells:fibroblast-derived:Retroviral_transf"        
## [120] "Endothelial_cells:lymphatic:KSHV"                      
## [121] "Endothelial_cells:blood_vessel"                        
## [122] "Monocyte:CD16-"                                        
## [123] "Monocyte:CD16+"                                        
## [124] "Tissue_stem_cells:BM_MSC:osteogenic"                   
## [125] "Hepatocytes"                                           
## [126] "Neutrophil:uropathogenic_E._coli_UTI89"                
## [127] "Neutrophil:commensal_E._coli_MG1655"                   
## [128] "MSC"                                                   
## [129] "Neuroepithelial_cell:ESC-derived"                      
## [130] "Astrocyte:Embryonic_stem_cell-derived"                 
## [131] "Endothelial_cells:HUVEC:IL-1b"                         
## [132] "HSC_CD34+"                                             
## [133] "CMP"                                                   
## [134] "GMP"                                                   
## [135] "B_cell:immature"                                       
## [136] "MEP"                                                   
## [137] "Myelocyte"                                             
## [138] "Pre-B_cell_CD34-"                                      
## [139] "Pro-B_cell_CD34+"                                      
## [140] "Pro-Myelocyte"                                         
## [141] "Smooth_muscle_cells:umbilical_vein"                    
## [142] "iPS_cells:foreskin_fibrobasts"                         
## [143] "iPS_cells:iPS:minicircle-derived"                      
## [144] "iPS_cells:adipose_stem_cells"                          
## [145] "iPS_cells:adipose_stem_cell-derived:lentiviral"        
## [146] "iPS_cells:adipose_stem_cell-derived:minicircle-derived"
## [147] "Fibroblasts:breast"                                    
## [148] "Monocyte:MCSF"                                         
## [149] "Monocyte:CXCL4"                                        
## [150] "Neurons:adrenal_medulla_cell_line"                     
## [151] "Tissue_stem_cells:CD326-CD56+"                         
## [152] "NK_cell:CD56hiCD62L+"                                  
## [153] "T_cell:Treg:Naive"                                     
## [154] "Neutrophil:LPS"                                        
## [155] "Neutrophil:GM-CSF_IFNg"                                
## [156] "Monocyte:S._typhimurium_flagellin"                     
## [157] "Neurons:Schwann_cell"
##Second check the fine cell type
pred.hesc.fine <- SingleR(test = hswound.singleR, ref = hpca.se, assay.type.test=1, labels = hpca.se$label.fine, clusters = Idents(hswound.combined.sct))
##Plot of fine cell type
plotScoreHeatmap(pred.hesc.fine)

celltype.fine <- as.data.frame(pred.hesc.fine) %>% tibble::rownames_to_column(var = "Cluster")
celltype.fine.score <- as.data.frame(pred.hesc.fine) %>% t() %>% as.data.frame() %>% tibble::rownames_to_column(var = "Cluster") %>% mutate(SingleR = "FineCellType")

celltype.fine$Cluster <- parse_integer(celltype.fine$Cluster)
markers <- markers %>% left_join(., celltype.fine[, c(1,159)], by=c("cluster" = "Cluster")) #Choose the first.label

colnames(markers)[12:13] <- c("SingleR_main", "SingleR_fine")
data.table::fwrite(markers, "s2_cleanSeurat_harmony_allSamples_markerGene.txt", sep = "\t")

celltype.scores <- rbind(celltype.main.score, celltype.fine.score) %>% dplyr::select(29, everything())
data.table::fwrite(celltype.scores, file = "s2_cleanSeurat_harmony_allSamples_markerGene_singleR_scores.txt", sep = "\t")

Sessioninfo

sessionInfo()
## R version 4.1.1 (2021-08-10)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Big Sur 10.16
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] parallel  stats4    stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] celldex_1.2.0               SingleR_1.7.1              
##  [3] SummarizedExperiment_1.22.0 Biobase_2.52.0             
##  [5] GenomicRanges_1.44.0        GenomeInfoDb_1.28.4        
##  [7] IRanges_2.26.0              S4Vectors_0.30.0           
##  [9] BiocGenerics_0.38.0         MatrixGenerics_1.4.3       
## [11] matrixStats_0.61.0          clustree_0.4.3             
## [13] ggraph_2.0.5                pheatmap_1.0.12            
## [15] harmony_0.1.0               Rcpp_1.0.7                 
## [17] cowplot_1.1.1               sctransform_0.3.2          
## [19] patchwork_1.1.1             magrittr_2.0.1             
## [21] forcats_0.5.1               stringr_1.4.0              
## [23] dplyr_1.0.7                 purrr_0.3.4                
## [25] readr_2.0.1                 tidyr_1.1.3                
## [27] tibble_3.1.4                ggplot2_3.3.5              
## [29] tidyverse_1.3.1             SeuratObject_4.0.2         
## [31] Seurat_4.0.4               
## 
## loaded via a namespace (and not attached):
##   [1] utf8_1.2.2                    reticulate_1.22              
##   [3] tidyselect_1.1.1              RSQLite_2.2.8                
##   [5] AnnotationDbi_1.54.1          htmlwidgets_1.5.4            
##   [7] grid_4.1.1                    BiocParallel_1.26.2          
##   [9] Rtsne_0.15                    munsell_0.5.0                
##  [11] ScaledMatrix_1.0.0            codetools_0.2-18             
##  [13] ica_1.0-2                     future_1.22.1                
##  [15] miniUI_0.1.1.1                withr_2.4.2                  
##  [17] colorspace_2.0-2              filelock_1.0.2               
##  [19] highr_0.9                     knitr_1.34                   
##  [21] rstudioapi_0.13               ROCR_1.0-11                  
##  [23] tensor_1.5                    listenv_0.8.0                
##  [25] labeling_0.4.2                GenomeInfoDbData_1.2.6       
##  [27] polyclip_1.10-0               bit64_4.0.5                  
##  [29] farver_2.1.0                  parallelly_1.28.1            
##  [31] vctrs_0.3.8                   generics_0.1.0               
##  [33] xfun_0.26                     BiocFileCache_2.0.0          
##  [35] R6_2.5.1                      graphlayouts_0.7.1           
##  [37] rsvd_1.0.5                    cachem_1.0.6                 
##  [39] bitops_1.0-7                  spatstat.utils_2.2-0         
##  [41] DelayedArray_0.18.0           assertthat_0.2.1             
##  [43] promises_1.2.0.1              scales_1.1.1                 
##  [45] gtable_0.3.0                  beachmat_2.8.1               
##  [47] globals_0.14.0                goftest_1.2-2                
##  [49] tidygraph_1.2.0               rlang_0.4.11                 
##  [51] splines_4.1.1                 lazyeval_0.2.2               
##  [53] spatstat.geom_2.2-2           broom_0.7.9                  
##  [55] checkmate_2.0.0               BiocManager_1.30.16          
##  [57] yaml_2.2.1                    reshape2_1.4.4               
##  [59] abind_1.4-5                   modelr_0.1.8                 
##  [61] backports_1.2.1               httpuv_1.6.3                 
##  [63] tools_4.1.1                   ellipsis_0.3.2               
##  [65] spatstat.core_2.3-0           jquerylib_0.1.4              
##  [67] RColorBrewer_1.1-2            ggridges_0.5.3               
##  [69] plyr_1.8.6                    sparseMatrixStats_1.4.2      
##  [71] zlibbioc_1.38.0               RCurl_1.98-1.5               
##  [73] rpart_4.1-15                  deldir_0.2-10                
##  [75] pbapply_1.5-0                 viridis_0.6.1                
##  [77] zoo_1.8-9                     haven_2.4.3                  
##  [79] ggrepel_0.9.1                 cluster_2.1.2                
##  [81] fs_1.5.0                      data.table_1.14.0            
##  [83] scattermore_0.7               lmtest_0.9-38                
##  [85] reprex_2.0.1                  RANN_2.6.1                   
##  [87] fitdistrplus_1.1-5            hms_1.1.0                    
##  [89] mime_0.11                     evaluate_0.14                
##  [91] xtable_1.8-4                  readxl_1.3.1                 
##  [93] gridExtra_2.3                 compiler_4.1.1               
##  [95] KernSmooth_2.23-20            crayon_1.4.1                 
##  [97] htmltools_0.5.2               mgcv_1.8-36                  
##  [99] later_1.3.0                   tzdb_0.1.2                   
## [101] lubridate_1.7.10              DBI_1.1.1                    
## [103] ExperimentHub_2.0.0           tweenr_1.0.2                 
## [105] dbplyr_2.1.1                  rappdirs_0.3.3               
## [107] MASS_7.3-54                   Matrix_1.3-4                 
## [109] cli_3.0.1                     igraph_1.2.6                 
## [111] pkgconfig_2.0.3               plotly_4.9.4.1               
## [113] spatstat.sparse_2.0-0         xml2_1.3.2                   
## [115] bslib_0.3.0                   XVector_0.32.0               
## [117] rvest_1.0.1                   digest_0.6.27                
## [119] RcppAnnoy_0.0.19              Biostrings_2.60.2            
## [121] spatstat.data_2.1-0           rmarkdown_2.11               
## [123] cellranger_1.1.0              leiden_0.3.9                 
## [125] uwot_0.1.10                   DelayedMatrixStats_1.14.3    
## [127] curl_4.3.2                    shiny_1.6.0                  
## [129] lifecycle_1.0.0               nlme_3.1-153                 
## [131] jsonlite_1.7.2                BiocNeighbors_1.10.0         
## [133] viridisLite_0.4.0             fansi_0.5.0                  
## [135] pillar_1.6.2                  lattice_0.20-44              
## [137] KEGGREST_1.32.0               fastmap_1.1.0                
## [139] httr_1.4.2                    survival_3.2-13              
## [141] interactiveDisplayBase_1.30.0 glue_1.4.2                   
## [143] png_0.1-7                     BiocVersion_3.13.1           
## [145] bit_4.0.4                     ggforce_0.3.3                
## [147] stringi_1.7.4                 sass_0.4.0                   
## [149] blob_1.2.2                    AnnotationHub_3.0.1          
## [151] BiocSingular_1.8.1            memoise_2.0.0                
## [153] irlba_2.3.3                   future.apply_1.8.1